In [1]:
import struct
import os
import numpy as np
import pandas as pd
import random
import plotly.graph_objects as go
import math
In [6]:
# --- Comparison files addressed by absolute path (one per experiment) ---
# NOTE(review): these are machine-specific absolute paths; the notebook only
# runs where these drives are mounted.
COMP_FILE = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng/sample_embeddings_001825.comparisons"
COMP_FILE_Q001 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_001825.comparisons"
COMP_FILE_Q001_04 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01_margin0.4/sample_embeddings_q0.01_margin0.4_002281.comparisons"
COMP_FILE_Q0001 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_002281.comparisons"
COMP_FILE_Q001_LSTM40_3LAYERS = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_lstm40_3layers_002281.comparisons"

COMP_FILE_Q001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_big_002281.comparisons"
COMP_FILE_Q0001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.4_002281.comparisons"
# NOTE(review): this path uses "Seagate Expansion Drive" while the sibling
# constants use "Seagate Expansion Drive1" -- confirm the mount point is intentional.
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05 = "/media/eduseiti/Seagate Expansion Drive/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.5_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048 = "/mnt/f633ac7c-3153-4566-a009-229a0ae5f8a1/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001_margin0.48/sample_embeddings_q0.001_big_lstm40_3layer_margin0.48_002281.comparisons"

# --- Comparison files addressed relative to BASE_PVALUE_FOLDER ---
# The constants below are bare file names; the cells that use them join them
# with BASE_PVALUE_FOLDER via os.path.join.
BASE_PVALUE_FOLDER="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue"
COMP_ALL_PVALUE_10="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_002281.comparisons"
COMP_ALL_PVALUE_10_LOG_SCALING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_log_scaling_002281.comparisons"

COMP_ALL_PVALUE_10_TEST="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_test_002281.comparisons"

COMP_ALL_PVALUE_10_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_winsorizing_002281.comparisons"

COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_identifications_fix_002281.comparisons"

COMP_ALL_PVALUE_10_CELL_STATE="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_002281.comparisons"

COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_no_winsorizing_002281.comparisons"

COMP_ALL_PVALUE_10_CELL_STATE_WINSORIZING_BUT_LINFENG="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_winsorizing_002281.comparisons"

COMP_ALL_PVALUE_10_N_PAIR="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_n_pair_002281.comparisons"

COMP_ALL_PVALUE_10_N_PAIR_2="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_n_pair_dl-16_002281.comparisons"

PVALUE_10_N_PAIR_LSTM50="sample_embeddings_q0.01_all_lstm50_3layer_pvalue_0.1_n_pair_002281.comparisons"

# struct format of one binary comparison record: unsigned char, unsigned int,
# unsigned char, unsigned int, double. There is no byte-order prefix, so the
# struct module applies native byte order, size and alignment.
# plot_comparissons_histogram names the five decoded fields
# file_1, scannr_1, file_2, scannr_2 and cosine_similarity.
STRUCT_FIELDS = "BIBId"
In [3]:
def decode_comparisons_file(comparisons_filename, struct_format=None):
    """Decode a binary comparisons file into a 2-D NumPy array.

    The file is read as a sequence of fixed-size records, each unpacked with
    the ``struct`` module. Every record whose floating-point field(s) contain
    NaN is reported on stdout together with its raw bytes, and a summary line
    with the number of decoded records is printed at the end.

    Parameters
    ----------
    comparisons_filename : str
        Path of the binary file to decode.
    struct_format : str, optional
        ``struct`` format string describing one record. When omitted, the
        module-level ``STRUCT_FIELDS`` is used.

    Returns
    -------
    numpy.ndarray
        One row per record and one column per record field. An empty file
        yields an array of shape ``(0, n_fields)`` so that callers can still
        build a DataFrame with the expected columns.

    Raises
    ------
    struct.error
        If the file ends with a partial (truncated) record.
    """
    # Compile the format once instead of re-parsing it for every record.
    record_struct = struct.Struct(STRUCT_FIELDS if struct_format is None else struct_format)
    record_size = record_struct.size

    # Unpacking an all-zero record tells us the field count and which fields
    # decode to float, i.e. which ones can hold NaN.
    zero_record = record_struct.unpack(bytes(record_size))
    float_fields = [index for index, value in enumerate(zero_record) if isinstance(value, float)]

    comparisons = []

    with open(comparisons_filename, "rb") as inputFile:
        # read() returns b"" at end of file, which is the sentinel that ends the loop.
        for record in iter(lambda: inputFile.read(record_size), b""):
            if len(record) != record_size:
                # Fail with context instead of the generic buffer-size message.
                raise struct.error(
                    "{}: truncated record #{} ({} bytes, expected {})".format(
                        comparisons_filename, len(comparisons), len(record), record_size))

            unpacked = record_struct.unpack(record)
            comparisons.append(unpacked)

            for index in float_fields:
                if math.isnan(unpacked[index]):
                    print("nan: {}".format(record))
                    break

    print("Decoded {} comparisons from {}".format(len(comparisons), comparisons_filename))

    if not comparisons:
        # np.array([]) would be 1-D; keep the column dimension for callers.
        return np.empty((0, len(zero_record)))

    return np.array(comparisons)
In [4]:
def plot_comparissons_histogram(comparisons_filename, sample_fraction=0.1, bins=1000, seed=None):
    """Decode a comparisons file, print similarity statistics and plot a histogram.

    The full ``cosine_similarity`` column is summarised with ``describe`` at
    5% percentile steps; the histogram is computed on a random sample of the
    column (drawn without replacement) and shown as a plotly bar chart.

    Parameters
    ----------
    comparisons_filename : str
        Path of the binary comparisons file, passed to ``decode_comparisons_file``.
    sample_fraction : float, optional
        Fraction of the rows used for the histogram (default 0.1).
    bins : int, optional
        Number of histogram bins (default 1000).
    seed : int, optional
        Random state for the sampling; ``None`` (default) keeps it non-deterministic.

    Returns
    -------
    tuple
        ``(comparisons_df, cosSim_histogram, costSim_bin_edges)``: the decoded
        DataFrame, the per-bin counts and the ``bins + 1`` bin edges.
    """
    comparisons = decode_comparisons_file(comparisons_filename)
    comparisons_df = pd.DataFrame(comparisons, columns = ["file_1", "scannr_1", "file_2", "scannr_2", "cosine_similarity"])

    similarities = comparisons_df['cosine_similarity']

    print(similarities.describe(percentiles=list(np.round(np.arange(0.0, 1.0, 0.05), 2))))

    # Vectorised sampling without replacement; int() truncates the sample size.
    sample_size = int(len(similarities) * sample_fraction)
    # NaN values are dropped because np.histogram cannot auto-detect a finite
    # range when the data contains NaN and raises ValueError.
    sampled = similarities.sample(n=sample_size, random_state=seed).dropna()

    cosSim_histogram, costSim_bin_edges = np.histogram(sampled, bins)

    # Place each bar at the centre of its bin rather than at the right edge.
    bin_centers = 0.5 * (costSim_bin_edges[:-1] + costSim_bin_edges[1:])

    fig = go.Figure()

    fig.add_trace(go.Bar(y=cosSim_histogram,
                         x=bin_centers,
                         marker_color='red'))

    fig.update_layout(xaxis_title="cosine similarity",
                      yaxis_title="count (sampled comparisons)")

    fig.show()

    return comparisons_df, cosSim_histogram, costSim_bin_edges

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using N-pair loss and LSTM50

In [5]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, PVALUE_10_N_PAIR_LSTM50))
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm50_3layer_pvalue_0.1_n_pair_002281.comparisons
count    4.518103e+07
mean     9.286434e-01
std      4.471363e-02
min      6.227846e-01
0%       6.227846e-01
5%       8.454221e-01
10%      8.670742e-01
15%      8.811850e-01
20%      8.920314e-01
25%      9.010225e-01
30%      9.088710e-01
35%      9.159672e-01
40%      9.225408e-01
45%      9.287578e-01
50%      9.347307e-01
55%      9.405526e-01
60%      9.462993e-01
65%      9.520451e-01
70%      9.578486e-01
75%      9.637709e-01
80%      9.698239e-01
85%      9.759886e-01
90%      9.822897e-01
95%      9.891713e-01
max      9.996473e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using N-pair loss ― bigger batches

In [7]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_N_PAIR_2))
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_n_pair_dl-16_002281.comparisons
count    4.518103e+07
mean     9.479722e-01
std      3.386621e-02
min      6.989008e-01
0%       6.989008e-01
5%       8.844660e-01
10%      9.015997e-01
15%      9.125054e-01
20%      9.208004e-01
25%      9.276815e-01
30%      9.336800e-01
35%      9.390943e-01
40%      9.440989e-01
45%      9.488215e-01
50%      9.533402e-01
55%      9.577192e-01
60%      9.619920e-01
65%      9.661964e-01
70%      9.703541e-01
75%      9.744870e-01
80%      9.786449e-01
85%      9.829197e-01
90%      9.874227e-01
95%      9.922686e-01
max      9.998141e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using N-pair loss

In [5]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_N_PAIR))
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_n_pair_002281.comparisons
count    4.518103e+07
mean     9.293182e-01
std      4.690314e-02
min      5.952803e-01
0%       5.952803e-01
5%       8.427855e-01
10%      8.649763e-01
15%      8.793784e-01
20%      8.905106e-01
25%      8.998085e-01
30%      9.080038e-01
35%      9.154798e-01
40%      9.224991e-01
45%      9.292031e-01
50%      9.357046e-01
55%      9.420836e-01
60%      9.484060e-01
65%      9.547213e-01
70%      9.610534e-01
75%      9.673563e-01
80%      9.735389e-01
85%      9.794855e-01
90%      9.852591e-01
95%      9.912386e-01
max      9.998494e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using cell state and winsorizing at 99%, but no winsorizing in linfeng

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_CELL_STATE_WINSORIZING_BUT_LINFENG))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using cell state, without winsorizing

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using cell state

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_CELL_STATE))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, the network fix, Winsorizing and identifications fix

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, the network fix, and Winsorizing

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_WINSORIZING))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10% and the network fix

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_TEST))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10% and applying log scaling on the spectra intensities

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10_LOG_SCALING))

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(os.path.join(BASE_PVALUE_FOLDER, COMP_ALL_PVALUE_10))
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Similarities sample (10%) histogram of clustering using all distances

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
# NOTE(review): `embeddings_hist` is assigned by several cells in this notebook, so its value depends on which cell ran last.
_, embeddings_hist, _ = plot_comparissons_histogram(COMP_FILE)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and margin 0.4

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q001_04_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_04)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and an LSTM40 3-layer model

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q001_lstm40_3layers_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_LSTM40_3LAYERS)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 with bigger training dataset

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_BIG)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_big_lstm40_3layer_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.4

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_big_lstm40_3layer_margin04_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.5

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_big_lstm40_3layer_margin05_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.48

In [ ]:
# Only the histogram counts are kept; the DataFrame and bin edges are bound to the throwaway name `_`.
_, embeddings_q0001_big_lstm40_3layer_margin048_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048)
In [ ]: